#note: data and graphs appear roughly in the order they appear in the paper.
#some code is redundant and may involve more steps than strictly necessary, 
    #but this is done for for the sake of clarity.

rm(list=ls(all=TRUE))
library(arm)
library(foreign)

#get average district and total vote time series

#first get dem seats and total vote for 1946--2006; unit of observation here is the election year (i.e. "aggregate")
agg.house.data <- read.dta("House_1946_2006_aggregate.dta", convert.underscore = T)
attach.all(agg.house.data)
unique.year.all <- unique(year)#create variable for each uniqye year
dem.seats.actual <- dem.seats
dem.seats <- dem.seats.per#Dem percentage of seats in House stemming from each election
#get total vote for 1946-2006
dem.per.total <- dem.per.total#dems percentage of total vote in each election

#get average district vote for all years
###########################################
house.4606 <- read.dta("House_1946_2006_updated.dta", convert.underscore = T) #1946-2006 data -- statecd is unit
house.4606 <- house.4606[house.4606$winner < 9,]#drop handful of 3rd party candidates
                        #not-including Bernie Sanders, who we count as a Democrat
detach()
attach.all(house.4606)
unique.year.4606 <- unique(year)#create year var for 1946-2004 (helps avoid confusion when looping)
adv <- rep(NA, length(unique.year.4606))#create empty vector for imputed avg. district vote, 1946-2006
for (i in 1:length(unique.year.4606)){
    adv[i] <- mean(dvoteimputed[year==unique.year.4606[i]])#get adv for each year, 1946-2004
}


#use regressions from 1946-2006 (in non-redistricting years) 
  #to get coefficients to set parameter estimates below

house.4606no02 <- house.4606[year !=1952 & year != 1962 & year != 1972 
    & year !=1982 & year != 1992 & year != 2002,]
detach()
attach.all(house.4606no02)
unique.year.4606no02 <- unique(year)
coefs <- array(NA, c(length(unique.year.4606no02 ), 4)) #create empty matrix to store coefficients
resid.errors <- rep(NA, length(unique.year.4606no02 )) #empty vector to store residual errors
for (i in 1:length(unique.year.4606no02 )){#loop over each 
    fit <- lm(dvoteimputed ~ dlagimputed + incumb +  
        partycontrol, subset = year == unique.year.4606no02[i])
    coefs[i,] <- coef(fit)
    resid.errors[i] <- sigma.hat(fit)
        }
start.year.indicator <- 21#1996
end.year.indicator <- 25#2006
rho <-  mean(coefs[start.year.indicator:end.year.indicator,2])#get rho by taking mean coef from 5 years leading up to election years: 
sigma <- mean(resid.errors[start.year.indicator:end.year.indicator])#get mean residual standard error from past 5 years
phi <- .08

#these come from yearly regressions in above loop;
  #i've commented out to save from having to run each time I run script
  #note these estimates are identical to those from 2006 paper
rmse.predict <- .022
rmse <- .0097
phi <- .08#estimated incumbency advantage
#thse also come from yearly regressions; while they're are based on rolling averages,
  #basically the same for 2006 and 2008, so define here for convience and use in both elections
rho <- .71
sigma <- .069 
inc.impute <- .75 #for uncontested races

n.sims <- 1000
##################################################################
#PREDICTING THE 2008 SEATS VOTES CURVE
  #plot 2006 and 2008 together
#library(foreign)
#get 2006 Data
#use 2004 vars from 2006 data to account for GA switch of 3rd and 8th districts
    #for s-v analyis
house.2006 <- read.dta("2006_house_data.dta", convert.underscore = T)
detach()
attach.all(house.2006)
i2006 <- house.2006[,"incumb06"]#incumbency in 2006
unc2006 <- house.2006[,"uncontested06"]#uncontested in 2006
dvote.2006 <- house.2006$dvoteimputed#imputed vote in 2006
adv.2006 <- mean(dvote.2006) #mean avg. district vote (imputed)
winner.2006 <- house.2006$winner#winner of each race (1 for Dems, 0 for GOP
dem.seats.2006 <- mean(winner.2006) #percent of Dem seats (233/435); counting FL-13 for GOP
v2004 <- dlag06imputed #use imputed lagged vote to get 2004 vote for 2006 s-v prediction
i2004 <- incumb04 #incumbency in 04 for s-v prediction

#use same range for 2006 and 2008
vbar.range <- seq(.35,.65, .002) #create range from 35 to 65

vbar.2004 <- mean(v2004)
sbar.50.2006 <- rep (NA, length(vbar.range))#vector for predicted seats (based on medians)
prob.2006 <- rep (NA, length(vbar.range)) #set up vector for prob. of winning house
for (j in 1:length(vbar.range)){#loop over intervals of vbar
 vbar <- vbar.range[j]
 sbar <- rep (NA, n.sims) 
 for (i in 1:n.sims){
   v.adj2004 <- v2004 - phi*i2004 #adjusted vote, taking out incumbency
   normvote2006 <- .5 + rho*(v.adj2004 - .5) #normal vote
   locfree2006 <- normvote2006 + phi*i2006 #location free: normal vote plus adjusted vote
   locfreenoisy2006 <- rnorm(length(locfree2006), locfree2006, sigma) #add noise to loc.free var
   withuncs2006 <- ifelse (unc2006==-1, 1-inc.impute,
                           ifelse (unc2006==1, inc.impute, locfreenoisy2006))
   swingfree2006 <- withuncs2006 + mean(v2004) - mean(withuncs2006) #take out swing
   v2006 <- swingfree2006 + vbar - mean(swingfree2006) 
   #V2006 <- withuncs2006 + vbar + mean(v2004) - mean(withuncs2006) - mean(swingfree2006) 
   sbar[i] <- mean(v2006>.5)
 }
 sbar.50.2006[j] <- mean(sbar)
 prob.2006[j] <- pnorm((sbar.50.2006[j] - 0.5)/rmse.predict)#use empirical predictive error for historical regressions (above)
}
#get v.bars when prob. == 10, 50, 90%
ten.percent.value.2006 <- mean(vbar.range[round(prob.2006,1) ==.10])
fifty.percent.value.2006 <- mean(vbar.range[round(prob.2006,1) ==.50])
ninety.percent.value.2006 <- mean(vbar.range[round(prob.2006,1) ==.90])
#get predicted seats and probability based on adv. for 2006
pred.seats.2006 <- mean(sbar.50.2006[vbar.range==round(adv.2006,3)])
seats.error.2006 <- dem.seats.2006 - pred.seats.2006
bias.2006 <- mean(2*(sbar.50.2006[round(vbar.range,2)==.50]-.5))
predicted.prob.2006 <- prob.2006[vbar.range == round(adv.2006,3)]
#get probability dems would have won house if they got the same adv as GOP in 1994
predicted.prob.1994 <- prob.2006[vbar.range == round(1-adv[unique.year.all==1994],2)]
#get number of seats GOP would have won with dems average district vote.
gop.seats.pred <- 1-mean(sbar.50.2006[round(vbar.range,3)==round(1-adv.2006,3)])
print(ten.percent.value.2006)
print(fifty.percent.value.2006)
print(ninety.percent.value.2006)
print(predicted.prob.2006)  
dem.50.seats.2006 <- mean(sbar.50.2006[round(vbar.range,1) ==.50])

######################################################
#read in 2008 data
house.2008 <- read.dta("2008_house_updated_pre_election.dta", convert.underscore = T)
detach()
attach.all(house.2008)

i2008 <- incumb.08#incumbency in 2008
unc2008 <- uncontested.08#uncontested in 2008

vbar.2006 <- mean(v2006)
sbar.50.2008 <- rep (NA, length(vbar.range))#vector for predicted seats (based on medians)
prob.2008 <- rep (NA, length(vbar.range)) #set up vector for prob. of winning house
for (j in 1:length(vbar.range)){#loop over intervals of vbar
 vbar <- vbar.range[j]
 sbar <- rep (NA, n.sims) 
 for (i in 1:n.sims){
   v.adj2006 <- v2006 - phi*i2006 #adjusted vote, taking out incumbency
   normvote2008 <- .5 + rho*(v.adj2006 - .5) #normal vote
   locfree2008 <- normvote2008 + phi*i2008 #location free: normal vote plus adjusted vote
   locfreenoisy2008 <- rnorm(length(locfree2008), locfree2008, sigma) #add noise to loc.free var
   withuncs2008 <- ifelse (unc2008==-1, 1-inc.impute,
                           ifelse (unc2008==1, inc.impute, locfreenoisy2008))
   swingfree2008 <- withuncs2008 + mean(v2006) - mean(withuncs2008) #take out swing
   v2008 <- swingfree2008 + vbar - mean(swingfree2008) 
   #V2008 <- withuncs2008 + vbar + mean(v2006) - mean(withuncs2008) - mean(swingfree2008) 
   sbar[i] <- mean(v2008>.5)
 }
 sbar.50.2008[j] <- mean(sbar)
 prob.2008[j] <- pnorm((sbar.50.2008[j] - 0.5)/rmse.predict)#use empirical predictive error for historical regressions (above)
}
#get v.bars when prob. == 10, 50, 90%
ten.percent.value.2008 <- mean(vbar.range[round(prob.2008,1) ==.10])
fifty.percent.value.2008 <- mean(vbar.range[round(prob.2008,1) ==.50])
ninety.percent.value.2008 <- mean(vbar.range[round(prob.2008,1) ==.90])
print(ten.percent.value.2008)
print(fifty.percent.value.2008)
print(ninety.percent.value.2008)

#get predicted seats for dems if they get x% of adv
dem.48.seats <- mean(sbar.50.2008[round(vbar.range,2) ==.48])
dem.50.seats <- mean(sbar.50.2008[round(vbar.range,2) ==.50])
dem.52.seats <- mean(sbar.50.2008[round(vbar.range,2) ==.52])
dem.55.seats <- mean(sbar.50.2008[round(vbar.range,2) ==.55])
dem.58.seats <- mean(sbar.50.2008[round(vbar.range,2) ==.58])

#get prediction for how many gop seats they would win if they got same adv as Dems in 2006
  #use dems and then take inverse

adv.2006.reverse <- 1-adv[31]
dem.2006.seats.hypothetical <- mean(sbar.50.2008[round(vbar.range,2) == round(adv.2006.reverse,2)])
gop.predicted.2006.adv <- 1-dem.2006.seats.hypothetical
########################
#plot both curves and probabilities together
axis.size <- 1.1 #control size of axis tick marks and numbers
axis.label <- 1.1 #control size of axis labels
lower.bound <- .4#for axis limits
upper.bound <- .6

pdf("Seats_Votes_2006_2008.pdf", height = 4, width = 8)

#Predicted SV Curves
  #2006
par (mfrow=c(1,2), mar = c(7,6,2,3), pty = "s")
plot(vbar.range, sbar.50.2006, type = "n", 
    ylab = "",  xlab = "", axes = F, xaxs="i", yaxs="i",
    ylim = c(lower.bound, upper.bound), xlim = c(lower.bound,upper.bound),, 
    main = "")
#lines(lowess(vbar.range.2006, sbar.50.2006))
axis(1, at = c(.40,.45, .50,.55,.60), label = c("40%", "", "50%", "", "60%"), mgp = c(2,.5,0), cex.axis = axis.size)
axis(2, at = c(.40,.45, .50,.55,.60), label = c("40%", "", "50%", "", "60%"), las = 2, mgp = c(2,.5,0), cex.axis = axis.size)
mtext("Average district vote\nfor Democrats", 1, cex = axis.label, line = 3.1)
mtext("Democratic share\nof House seats", 2, cex = axis.label, line=3.1)
abline(h=.5,  col="gray", lwd=.5)
abline(v=.5,  col="gray", lwd=.5)
points(adv.2006, dem.seats.2006, type = "p", pch = 19, cex = .8, col = "blue")
box()
#add curves
points(vbar.range, sbar.50.2006, type = "l", lty = 2, col = "blue")#2006  
points(vbar.range, sbar.50.2008, type = "l", lty = 1, col = "dark green")#2008
#label curves
text(.53, .48, "2006")
text(.515, .58, "2008")

#PROBABILITY CURVES
par (mar = c(7,6,2,3))
plot (vbar.range, prob.2006, type="n", 
    main = "", 
    xlab ="", ylab ="", axes = F,  xaxs="i",
     yaxs="i", xlim = c(lower.bound,upper.bound), ylim = c(0,1))
axis(1, at = c(.40,.45, .50,.55,.60), label = c("40%", "", "50%", "", "60%"), mgp = c(2,.5,0), , cex.axis = axis.size)
axis(2, at = c(seq(0,1,by =.25)), label = c(0,".25",".5",".75",1), las = 2, mgp = c(2,.5,0), cex.axis = axis.size)
mtext("Average district vote\nfor Democrats", 1, cex = axis.label, line = 3.1)
mtext("Probability Democrats\ncontrol House", 2, cex = axis.label, line =2.7)
abline (h=.5,col="gray", lwd=.5)#50% line
abline (h=.1, col="gray", lwd=.5)#10 % line
abline (h=.9, col="gray", lwd=.5)#90% line
#add curves
points(vbar.range, prob.2006, type="l", lty =2,  col = "blue")#2006
points(vbar.range, prob.2008, type="l", lty = 1, col = "dark green")#2008
#add point for actual avd vote and predicted probabloty
points(adv.2006, predicted.prob.2006, type = "p", pch = 19, cex = .8, col = "blue")
text(.56, .76, "2006")
text(.475, .63, "2008")
box()

dev.off()
#########################

#Figure 2: plot time series, then add in predicted seats for dems based on various adv's.
year.at.vec <- c((seq(1946, 2008, by = 2)))#use to plot tick marks on x-axis
year.label.vec <- c("", "", "1950", "","","", "", "1960", "","","", "", "1970",
    "","","", "", "1980", "","","", "", "1990", "","","", "", "2000","", "",  "2008")#x-axis labels

pdf("avd_vs_seats_allyears.pdf", height =5, width = 11)

par(mfrow=c(1,1), mar = c(2,3,.5,0))#set up margins for plot
plot(unique.year.all, adv, axes = F, type = "n", xlab = "", 
    ylab = "", xlim = c(1946, 2013), ylim = c(.43,.68), xaxs="i", yaxs="i",
    main = "")#call empty plot so can add shading first, or else shading would block out lines
#add shading for different periods of party control GOP: 1946-1948, 1952-54, 1994-2006
shade.color = "gray92"
polygon(x=c(1946, 1946, 1948, 1948),
    y=par()$usr[c(3,4,4,3)],
    col= shade.color,   ## desired color
    border=F)         ## no border
polygon(x=c(1952, 1952, 1954, 1954),
    y=par()$usr[c(3,4,4,3)],
    col= shade.color,   ## desired color
    border=F)         ## no border
polygon(x=c(1994, 1994, 2006, 2006),
    y=par()$usr[c(3,4,4,3)],
    col= shade.color,   ## desired color
    border=F)         ## no border
points(unique.year.all, adv, type = "l")#points/line for avg. district vote
points(unique.year.all, dem.seats, type = "l", lty = 2)##points/line for dem percent of seats
axis(1,  at = year.at.vec, labels = NA, las = 1)
axis(1,  at = c(1950, 1960, 1970, 1980, 1990, 2000, 2008), 
    labels = c(1950, 1960, 1970, 1980, 1990, 2000,2008), las = 1, cex.axis = 1.2, mgp = c(2,.7,0))
axis(2, las = 1, at = c(seq(.4, .70, by = .05)), 
    label = c("40", "45%", "50%", "55%", "60%", "65%", "70%"), cex.axis = 1.2,mgp = c(2,.7,0))
segments(1946, .5, 2008, .5,  col="gray", lwd=.5)#light line for 50%
text(1976, .515, "Average district vote\nfor Democrats", cex = 1.2)#labels for each line
text(1984, .66, "Democrats' percentage\n of House seats", cex = 1.2)#labels for each line
#get starting points for extending lines to 2008
adv.start <-adv[31]
dem.seats.start <- dem.seats[31]
end.point <- 2008
points(2006, dem.seats.start, pch = 19)
#.48
segments(2006, dem.seats.start, end.point, dem.48.seats, lty =4)
text(end.point + .3, dem.48.seats,"48% vote\nshare ", adj = 0)
points(end.point, dem.48.seats,pch=19)
#.50
segments(2006, dem.seats.start, end.point, dem.50.seats, lty =4)
text(end.point + .3, dem.50.seats,"50% vote\nshare ", adj = 0)
points(end.point, dem.50.seats,pch=19)
#.52
segments(2006, dem.seats.start, end.point, dem.52.seats, lty =4)
text(end.point + .3, dem.52.seats,"52% vote\nshare ", adj = 0)
points(end.point, dem.52.seats,pch=19)
#.55
segments(2006, dem.seats.start, end.point, dem.55.seats, lty =4)
text(end.point + .3, dem.55.seats,"55% vote\nshare ", adj = 0)
points(end.point, dem.55.seats,pch=19)
#.58
segments(2006, dem.seats.start, end.point, dem.58.seats, lty =4)
text(end.point + .3, dem.58.seats,"58% vote\nshare ", adj = 0)
points(end.point, dem.58.seats,pch=19)

dev.off()


#for 1994-2006, depict actual gop adv, actual seats, predicted seats given 2008 seats curve
  #then do same for Dems
keep <- unique.year.all>=1994 & unique.year.all <= 2006
n.elections <- length(seq(1994,2006,2))
dem.adv.9406 <- adv[keep]#keep in terms of dems for consistency with SV curve
gop.seats.actual.9406 <- 435 - dem.seats.actual[keep]
dem.seats.actual.9406 <- dem.seats.actual[keep]
gop.seats.predicted.9406 <- rep(NA,n.elections)
dem.seats.predicted.9406 <-  rep(NA,n.elections)
  for (i in 1:n.elections){
    dem.predicted.percent.seats <-  mean(sbar.50.2008[vbar.range==round(dem.adv.9406[i],2)])
    dem.seats.predicted.9406[i] <- round(dem.predicted.percent.seats*435,0)
    gop.seats.predicted.9406[i] <- 435 - dem.seats.predicted.9406[i]
}

print(cbind(1-dem.adv.9406, gop.seats.actual.9406, gop.seats.predicted.9406))
print(cbind(dem.adv.9406, dem.seats.actual.9406, dem.seats.predicted.9406))
gop.matrix <- rbind(gop.seats.actual.9406, gop.seats.predicted.9406)
dem.matrix <- rbind(dem.seats.actual.9406, dem.seats.predicted.9406)
gop.x.label = c("1994\n(.52)", "1996\n(.49)", "1998\n(.49)", "2000\n(.48)", "2002\n(.50)", "2004\n(.49)", "2006\n(.45)")
dem.x.label = c("1994\n(.48)", "1996\n(.51)", "1998\n(.51)", "2000\n(.52)", "2002\n(.50)", "2004\n(.51)", "2006\n(.55)")


pdf("actual_predicted_seats_94_06.pdf", height = 6, width = 6.5)

#gop
par(mfrow=c(1,1), mar=c(4,3.5,2.2,1))
gop.plot <- barplot(gop.matrix, main = "", ylim = c(170, 245),
    xlab = "", beside=T, 
    axisnames=FALSE, axes =F, horiz=F, xpd = F, #density = 100,
    col = c("gray50", "gray90"))
axis(1, at=gop.plot[1,]+.5, labels = gop.x.label, cex = 1, las = 1, tick = T, mgp = c(2,1.5,0))
abline(h=170)
axis(2, at = c(seq(170,260,10),265), label = c(seq(170,260,10),""),  las = 1, tick = T, mgp = c(2,.5,0))
box(bty = "l")
mtext("Republican Seats", 2, line = 2.5, cex = 1.2)
text(gop.plot[1,1],gop.matrix[1,1]+1, "Actual", adj = 0, srt = 55)
text(gop.plot[2,1],gop.matrix[2,1]+1, "Hypothetical", adj = 0, srt = 55, xpd=T)
abline(h= 218, lty = 2)#horiz line for majority
text(19.5,220, "Majority (218)")

dev.off()
